# Scrapy settings for the nirsoft_crawler project.
#
# This configuration trades throughput for stealth: a single slow request
# stream with browser-like headers, aimed at avoiding anti-bot blocking.

BOT_NAME = "nirsoft_crawler"

SPIDER_MODULES = ["nirsoft_crawler.spiders"]
NEWSPIDER_MODULE = "nirsoft_crawler.spiders"

# User-Agent mimicking a real desktop browser (Edge 116 on Windows 10 x64).
USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36 Edg/116.0.1938.62"

# Full browser-like request headers to strengthen anti-scraping evasion.
DEFAULT_REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
    'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
    'Cache-Control': 'max-age=0',
    'Connection': 'keep-alive',
    'Upgrade-Insecure-Requests': '1',
    'Sec-Fetch-Dest': 'document',
    'Sec-Fetch-Mode': 'navigate',
    'Sec-Fetch-Site': 'none',
    'Sec-Fetch-User': '?1',
    'Pragma': 'no-cache',
}

# Do not honor robots.txt restrictions.
ROBOTSTXT_OBEY = False

# Download concurrency and delay strategy (key tuning point).
CONCURRENT_REQUESTS = 1          # one request at a time to avoid tripping anti-bot limits
DOWNLOAD_DELAY = 8               # 8-second pause between downloads
COOKIES_ENABLED = False
TELNETCONSOLE_ENABLED = False

# AutoThrottle: adapt the delay dynamically to observed server latency.
AUTOTHROTTLE_ENABLED = True
AUTOTHROTTLE_START_DELAY = 5            # initial delay (seconds)
AUTOTHROTTLE_MAX_DELAY = 20             # upper bound on the adaptive delay
AUTOTHROTTLE_TARGET_CONCURRENCY = 0.5   # target < 1 request in flight on average

# File-download pipeline.
ITEM_PIPELINES = {
    'nirsoft_crawler.pipelines.NirsoftFilesPipeline': 300,
}

FILES_STORE = r"D:\课程设计汇总\数据采集课程设计\下载"
FILES_URLS_FIELD = 'file_urls'
FILES_RESULT_FIELD = 'files'

# Redirect strategy.
REDIRECT_ENABLED = True           # follow HTTP redirects
REDIRECT_MAX_TIMES = 7            # allow longer redirect chains than the default
REDIRECT_PRIORITY_ADJUST = -2     # deprioritize redirected requests slightly
MEDIA_ALLOW_REDIRECTS = True      # let the files pipeline follow redirects too

# Retry strategy.
# FIX: RETRY_TIMES was previously assigned twice (3, then 5); the first
# assignment was dead code and only 5 ever took effect. Declared once now.
RETRY_ENABLED = True
RETRY_TIMES = 5                   # retry each failed request up to 5 times
# NOTE(review): with REDIRECT_ENABLED = True, 302 responses are consumed by
# RedirectMiddleware before RetryMiddleware sees them, so the 302 entry below
# is effectively inert; kept to preserve the original configuration.
RETRY_HTTP_CODES = [500, 502, 503, 504, 400, 403, 404, 302]
DOWNLOAD_TIMEOUT = 60             # extend the per-request timeout to 60 seconds

# Logging (use DEBUG while developing, INFO in production).
LOG_LEVEL = 'INFO'
#LOG_FILE = 'scrapy_log.txt'       # uncomment to persist logs for analysis

# Downloader middleware overrides.
DOWNLOADER_MIDDLEWARES = {
    # Explicitly disabled; redundant with ROBOTSTXT_OBEY = False but harmless.
    'scrapy.downloadermiddlewares.robotstxt.RobotsTxtMiddleware': None,
    # Keep compression enabled at (near-default) priority 800.
    'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 800,
}